In [48]:
#pip install folium
In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import folium
In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
In [5]:
df=pd.read_csv('Dataset .csv')
df
Out[5]:
Restaurant ID Restaurant Name Country Code City Address Locality Locality Verbose Longitude Latitude Cuisines ... Currency Has Table booking Has Online delivery Is delivering now Switch to order menu Price range Aggregate rating Rating color Rating text Votes
0 6317637 Le Petit Souffle 162 Makati City Third Floor, Century City Mall, Kalayaan Avenu... Century City Mall, Poblacion, Makati City Century City Mall, Poblacion, Makati City, Mak... 121.027535 14.565443 French, Japanese, Desserts ... Botswana Pula(P) Yes No No No 3 4.8 Dark Green Excellent 314
1 6304287 Izakaya Kikufuji 162 Makati City Little Tokyo, 2277 Chino Roces Avenue, Legaspi... Little Tokyo, Legaspi Village, Makati City Little Tokyo, Legaspi Village, Makati City, Ma... 121.014101 14.553708 Japanese ... Botswana Pula(P) Yes No No No 3 4.5 Dark Green Excellent 591
2 6300002 Heat - Edsa Shangri-La 162 Mandaluyong City Edsa Shangri-La, 1 Garden Way, Ortigas, Mandal... Edsa Shangri-La, Ortigas, Mandaluyong City Edsa Shangri-La, Ortigas, Mandaluyong City, Ma... 121.056831 14.581404 Seafood, Asian, Filipino, Indian ... Botswana Pula(P) Yes No No No 4 4.4 Green Very Good 270
3 6318506 Ooma 162 Mandaluyong City Third Floor, Mega Fashion Hall, SM Megamall, O... SM Megamall, Ortigas, Mandaluyong City SM Megamall, Ortigas, Mandaluyong City, Mandal... 121.056475 14.585318 Japanese, Sushi ... Botswana Pula(P) No No No No 4 4.9 Dark Green Excellent 365
4 6314302 Sambo Kojin 162 Mandaluyong City Third Floor, Mega Atrium, SM Megamall, Ortigas... SM Megamall, Ortigas, Mandaluyong City SM Megamall, Ortigas, Mandaluyong City, Mandal... 121.057508 14.584450 Japanese, Korean ... Botswana Pula(P) Yes No No No 4 4.8 Dark Green Excellent 229
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
9546 5915730 Naml۱ Gurme 208 ��stanbul Kemanke�� Karamustafa Pa��a Mahallesi, R۱ht۱m ... Karak�_y Karak�_y, ��stanbul 28.977392 41.022793 Turkish ... Turkish Lira(TL) No No No No 3 4.1 Green Very Good 788
9547 5908749 Ceviz A��ac۱ 208 ��stanbul Ko��uyolu Mahallesi, Muhittin ��st�_nda�� Cadd... Ko��uyolu Ko��uyolu, ��stanbul 29.041297 41.009847 World Cuisine, Patisserie, Cafe ... Turkish Lira(TL) No No No No 3 4.2 Green Very Good 1034
9548 5915807 Huqqa 208 ��stanbul Kuru�_e��me Mahallesi, Muallim Naci Caddesi, N... Kuru�_e��me Kuru�_e��me, ��stanbul 29.034640 41.055817 Italian, World Cuisine ... Turkish Lira(TL) No No No No 4 3.7 Yellow Good 661
9549 5916112 A���k Kahve 208 ��stanbul Kuru�_e��me Mahallesi, Muallim Naci Caddesi, N... Kuru�_e��me Kuru�_e��me, ��stanbul 29.036019 41.057979 Restaurant Cafe ... Turkish Lira(TL) No No No No 4 4.0 Green Very Good 901
9550 5927402 Walter's Coffee Roastery 208 ��stanbul Cafea��a Mahallesi, Bademalt۱ Sokak, No 21/B, ... Moda Moda, ��stanbul 29.026016 40.984776 Cafe ... Turkish Lira(TL) No No No No 2 4.0 Green Very Good 591

9551 rows × 21 columns

  • 1.Explore the dataset and identify the numberof rows and columns
In [7]:
df.head()
Out[7]:
Restaurant ID Restaurant Name Country Code City Address Locality Locality Verbose Longitude Latitude Cuisines ... Currency Has Table booking Has Online delivery Is delivering now Switch to order menu Price range Aggregate rating Rating color Rating text Votes
0 6317637 Le Petit Souffle 162 Makati City Third Floor, Century City Mall, Kalayaan Avenu... Century City Mall, Poblacion, Makati City Century City Mall, Poblacion, Makati City, Mak... 121.027535 14.565443 French, Japanese, Desserts ... Botswana Pula(P) Yes No No No 3 4.8 Dark Green Excellent 314
1 6304287 Izakaya Kikufuji 162 Makati City Little Tokyo, 2277 Chino Roces Avenue, Legaspi... Little Tokyo, Legaspi Village, Makati City Little Tokyo, Legaspi Village, Makati City, Ma... 121.014101 14.553708 Japanese ... Botswana Pula(P) Yes No No No 3 4.5 Dark Green Excellent 591
2 6300002 Heat - Edsa Shangri-La 162 Mandaluyong City Edsa Shangri-La, 1 Garden Way, Ortigas, Mandal... Edsa Shangri-La, Ortigas, Mandaluyong City Edsa Shangri-La, Ortigas, Mandaluyong City, Ma... 121.056831 14.581404 Seafood, Asian, Filipino, Indian ... Botswana Pula(P) Yes No No No 4 4.4 Green Very Good 270
3 6318506 Ooma 162 Mandaluyong City Third Floor, Mega Fashion Hall, SM Megamall, O... SM Megamall, Ortigas, Mandaluyong City SM Megamall, Ortigas, Mandaluyong City, Mandal... 121.056475 14.585318 Japanese, Sushi ... Botswana Pula(P) No No No No 4 4.9 Dark Green Excellent 365
4 6314302 Sambo Kojin 162 Mandaluyong City Third Floor, Mega Atrium, SM Megamall, Ortigas... SM Megamall, Ortigas, Mandaluyong City SM Megamall, Ortigas, Mandaluyong City, Mandal... 121.057508 14.584450 Japanese, Korean ... Botswana Pula(P) Yes No No No 4 4.8 Dark Green Excellent 229

5 rows × 21 columns

In [9]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9551 entries, 0 to 9550
Data columns (total 21 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Restaurant ID         9551 non-null   int64  
 1   Restaurant Name       9551 non-null   object 
 2   Country Code          9551 non-null   int64  
 3   City                  9551 non-null   object 
 4   Address               9551 non-null   object 
 5   Locality              9551 non-null   object 
 6   Locality Verbose      9551 non-null   object 
 7   Longitude             9551 non-null   float64
 8   Latitude              9551 non-null   float64
 9   Cuisines              9542 non-null   object 
 10  Average Cost for two  9551 non-null   int64  
 11  Currency              9551 non-null   object 
 12  Has Table booking     9551 non-null   object 
 13  Has Online delivery   9551 non-null   object 
 14  Is delivering now     9551 non-null   object 
 15  Switch to order menu  9551 non-null   object 
 16  Price range           9551 non-null   int64  
 17  Aggregate rating      9551 non-null   float64
 18  Rating color          9551 non-null   object 
 19  Rating text           9551 non-null   object 
 20  Votes                 9551 non-null   int64  
dtypes: float64(3), int64(5), object(13)
memory usage: 1.5+ MB
In [11]:
df.shape
Out[11]:
(9551, 21)
  • 9551 Rows & 21 Columns
  • 2 Check for missing values in each column and

handle them accordingly.

In [13]:
df.isnull().sum()
Out[13]:
Restaurant ID           0
Restaurant Name         0
Country Code            0
City                    0
Address                 0
Locality                0
Locality Verbose        0
Longitude               0
Latitude                0
Cuisines                9
Average Cost for two    0
Currency                0
Has Table booking       0
Has Online delivery     0
Is delivering now       0
Switch to order menu    0
Price range             0
Aggregate rating        0
Rating color            0
Rating text             0
Votes                   0
dtype: int64
  • There are only 9 missing values in 'Cuisine' column which is very less

  • So, we can ignore or just replace these with 'Not Specified'

In [15]:
df['Cuisines'].fillna('Not Specified',inplace=True)
C:\Users\dhana\AppData\Local\Temp\ipykernel_2304\3188408958.py:1: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Cuisines'].fillna('Not Specified',inplace=True)
In [17]:
df.isnull().sum()
Out[17]:
Restaurant ID           0
Restaurant Name         0
Country Code            0
City                    0
Address                 0
Locality                0
Locality Verbose        0
Longitude               0
Latitude                0
Cuisines                0
Average Cost for two    0
Currency                0
Has Table booking       0
Has Online delivery     0
Is delivering now       0
Switch to order menu    0
Price range             0
Aggregate rating        0
Rating color            0
Rating text             0
Votes                   0
dtype: int64
  • 3.Perform data type conversion if necessary. Analyze the distribution of the target variable ("Aggregate rating") and identify any class imbalances.
In [19]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9551 entries, 0 to 9550
Data columns (total 21 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Restaurant ID         9551 non-null   int64  
 1   Restaurant Name       9551 non-null   object 
 2   Country Code          9551 non-null   int64  
 3   City                  9551 non-null   object 
 4   Address               9551 non-null   object 
 5   Locality              9551 non-null   object 
 6   Locality Verbose      9551 non-null   object 
 7   Longitude             9551 non-null   float64
 8   Latitude              9551 non-null   float64
 9   Cuisines              9551 non-null   object 
 10  Average Cost for two  9551 non-null   int64  
 11  Currency              9551 non-null   object 
 12  Has Table booking     9551 non-null   object 
 13  Has Online delivery   9551 non-null   object 
 14  Is delivering now     9551 non-null   object 
 15  Switch to order menu  9551 non-null   object 
 16  Price range           9551 non-null   int64  
 17  Aggregate rating      9551 non-null   float64
 18  Rating color          9551 non-null   object 
 19  Rating text           9551 non-null   object 
 20  Votes                 9551 non-null   int64  
dtypes: float64(3), int64(5), object(13)
memory usage: 1.5+ MB
  • No need to do any data type conversion
In [21]:
# target variables 'Aggregate rating'

target='Aggregate rating'

# descriptive sttistics 

print(df[target].describe())
count    9551.000000
mean        2.666370
std         1.516378
min         0.000000
25%         2.500000
50%         3.200000
75%         3.700000
max         4.900000
Name: Aggregate rating, dtype: float64
In [23]:
# Box plot

plt.figure(figsize=(8,5))
sns.boxplot(x=df[target])
plt.title('Box Plot')
plt.xlabel('Aggregative Rating')
plt.show()
No description has been provided for this image
In [25]:
# Histogram 

plt.figure(figsize=(8,5))
sns.histplot(df[target],bins=30,kde=True,color='green')
plt.title('Histogram')
plt.xlabel('Aggregate Rating')
plt.ylabel('Frequency')
plt.show()
No description has been provided for this image
  • No clas imbalance

Level 1 - Task 2 :-¶

Task : Descriptive Analysis¶

    1. Calculate basic statistical measures (mean,median,standard deviation,etc.) for numerical columns
In [27]:
df.describe()
Out[27]:
Restaurant ID Country Code Longitude Latitude Average Cost for two Price range Aggregate rating Votes
count 9.551000e+03 9551.000000 9551.000000 9551.000000 9551.000000 9551.000000 9551.000000 9551.000000
mean 9.051128e+06 18.365616 64.126574 25.854381 1199.210763 1.804837 2.666370 156.909748
std 8.791521e+06 56.750546 41.467058 11.007935 16121.183073 0.905609 1.516378 430.169145
min 5.300000e+01 1.000000 -157.948486 -41.330428 0.000000 1.000000 0.000000 0.000000
25% 3.019625e+05 1.000000 77.081343 28.478713 250.000000 1.000000 2.500000 5.000000
50% 6.004089e+06 1.000000 77.191964 28.570469 400.000000 2.000000 3.200000 31.000000
75% 1.835229e+07 1.000000 77.282006 28.642758 700.000000 2.000000 3.700000 131.000000
max 1.850065e+07 216.000000 174.832089 55.976980 800000.000000 4.000000 4.900000 10934.000000
In [29]:
numerical_stats = df.describe()
print(numerical_stats)
       Restaurant ID  Country Code    Longitude     Latitude  \
count   9.551000e+03   9551.000000  9551.000000  9551.000000   
mean    9.051128e+06     18.365616    64.126574    25.854381   
std     8.791521e+06     56.750546    41.467058    11.007935   
min     5.300000e+01      1.000000  -157.948486   -41.330428   
25%     3.019625e+05      1.000000    77.081343    28.478713   
50%     6.004089e+06      1.000000    77.191964    28.570469   
75%     1.835229e+07      1.000000    77.282006    28.642758   
max     1.850065e+07    216.000000   174.832089    55.976980   

       Average Cost for two  Price range  Aggregate rating         Votes  
count           9551.000000  9551.000000       9551.000000   9551.000000  
mean            1199.210763     1.804837          2.666370    156.909748  
std            16121.183073     0.905609          1.516378    430.169145  
min                0.000000     1.000000          0.000000      0.000000  
25%              250.000000     1.000000          2.500000      5.000000  
50%              400.000000     2.000000          3.200000     31.000000  
75%              700.000000     2.000000          3.700000    131.000000  
max           800000.000000     4.000000          4.900000  10934.000000  
In [31]:
df[['Average Cost for two','Price range','Aggregate rating','Votes']].describe()
Out[31]:
Average Cost for two Price range Aggregate rating Votes
count 9551.000000 9551.000000 9551.000000 9551.000000
mean 1199.210763 1.804837 2.666370 156.909748
std 16121.183073 0.905609 1.516378 430.169145
min 0.000000 1.000000 0.000000 0.000000
25% 250.000000 1.000000 2.500000 5.000000
50% 400.000000 2.000000 3.200000 31.000000
75% 700.000000 2.000000 3.700000 131.000000
max 800000.000000 4.000000 4.900000 10934.000000
  • 2 Explore the distribution of categorical

variables like "Country Code,

" "City,

" and

"Cuisines. "

Identify the top cuisines and cities with the highest number of restaurants.

In [33]:
# Explore the distribution of categorical variables like "Country Code

plt.figure(figsize=(8,5))
sns.countplot(x='Country Code',data=df)
plt.title('Distribution of Resturants by Country Code')
plt.xlabel('Country Code')
plt.ylabel('No of Resturants')
plt.show()
No description has been provided for this image
  • The majority of restaurants are located in Country Code 1.Followed by the second-highest concentration in Country Code 216
In [35]:
# Top Countries with the highest number of restaurants

top_countries=df['Country Code'].value_counts().head()
print('Top 5 Countries with the Highest number of restaurants')
print(top_countries)
Top 5 Countries with the Highest number of restaurants
Country Code
1      8652
216     434
215      80
30       60
214      60
Name: count, dtype: int64
In [37]:
# Explore the distribution of 'City'

plt.figure(figsize=(8,5))

sns.countplot(x='City',data=df,order=df['City'].value_counts().head(20).index)

plt.title('Distribution of Resturants by City')
plt.xlabel('City')
plt.ylabel('No of Resturants')
plt.xticks(rotation=45)
plt.show()
No description has been provided for this image
In [39]:
# Explore the distribution of 'Cuisines'

plt.figure(figsize=(15,6))

cuisines_count=df['Cuisines'].value_counts()
cuisines_count.head(20).plot(kind='bar', color=sns.color_palette('Set2'))

plt.title('Top 20 Cuisines with the highest number of Restaurants')
plt.xlabel('Cuisines')
plt.ylabel('No of Resturants')
plt.xticks(rotation=45)
plt.show()
No description has been provided for this image
In [41]:
# Top Cuisines and Cities
# Top cities with the highest number of restaurants

top_cities=df['City'].value_counts().head(10)

print('Top 10 cities with the Highest Number of Restaurants : ')
print(top_cities)
Top 10 cities with the Highest Number of Restaurants : 
City
New Delhi       5473
Gurgaon         1118
Noida           1080
Faridabad        251
Ghaziabad         25
Bhubaneshwar      21
Amritsar          21
Ahmedabad         21
Lucknow           21
Guwahati          21
Name: count, dtype: int64
In [43]:
# Top cuisines with the highest number of restaurants

top_cuisines=cuisines_count.head(10)
print('Top 10 cuisines with the highest number of restaurants:')
print(top_cuisines)
Top 10 cuisines with the highest number of restaurants:
Cuisines
North Indian                      936
North Indian, Chinese             511
Chinese                           354
Fast Food                         354
North Indian, Mughlai             334
Cafe                              299
Bakery                            218
North Indian, Mughlai, Chinese    197
Bakery, Desserts                  170
Street Food                       149
Name: count, dtype: int64
In [45]:
city_counts = df['City'].value_counts()
print(city_counts)
City
New Delhi           5473
Gurgaon             1118
Noida               1080
Faridabad            251
Ghaziabad             25
                    ... 
Panchkula              1
Mc Millan              1
Mayfield               1
Macedon                1
Vineland Station       1
Name: count, Length: 141, dtype: int64

Level 1 - Task 3:-¶

Task : Geospatial Analysis

  • 1.Visualize the locations of restaurants on a map using latitude and longitude information
In [13]:
#pip install shapely geopandas
In [19]:
#pip install geopandas fiona pyproj
In [47]:
# Locations of restaurants on a map using latitude and longitude information
# Import the necessary libraries

from shapely.geometry import Point
import geopandas as gpd
from geopandas import GeoDataFrame
In [49]:
import folium
restaurant_map = folium.Map(location=[df['Latitude'].mean(), df['Longitude'].mean()], zoom_start=6)
for index, row in df.iterrows():
    popup_text = f"{row['Restaurant Name']} - Rating: {row['Votes']}"
    folium.Marker([row['Latitude'], row['Longitude']], popup=popup_text).add_to(restaurant_map)
restaurant_map.save('restaurant_map.html')

plt.scatter(df['Longitude'], df['Latitude'], s=df['Votes'] * 20, alpha=0.7)
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.title('Restaurant Locations and Ratings')
plt.show()

city_distribution = df['City'].value_counts()

plt.bar(city_distribution.index, city_distribution.values)
plt.xlabel('City')
plt.ylabel('Number of Restaurants')
plt.title('Restaurant Distribution across Cities')
plt.xticks(rotation=45)
plt.show()
No description has been provided for this image
No description has been provided for this image
In [51]:
import seaborn as sns
plt.figure(figsize=(15, 6))
sns.barplot(x=city_counts.index,y=city_counts.values)
plt.xticks(rotation=90)
plt.xlabel("City")
plt.ylabel("Number of Restaurants")
plt.title("Distribution of Restaurants across Cities/Countries")
plt.show()
No description has been provided for this image
In [53]:
plt.figure(figsize=(8,5))

sns.countplot(y=df['City'],order=df.City.value_counts().head(10).index,palette='Set2')

plt.xlabel('Number of Restaurants')
plt.ylabel('name of Cities')
plt.title('Distribution of Restaurants Acress Cities')

plt.show()
C:\Users\dhana\AppData\Local\Temp\ipykernel_2304\850872848.py:3: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.countplot(y=df['City'],order=df.City.value_counts().head(10).index,palette='Set2')
No description has been provided for this image
In [55]:
map_location = [df['Latitude'].iloc[0], df['Longitude'].iloc[0]]
map_restaurent = folium.Map(location=map_location, zoom_start=12)

for index, row in df.iterrows():
    folium.Marker(location=[row['Latitude'], row['Longitude']],
                  popup=row['Restaurant Name'],
                  icon=folium.Icon(icon='cutlery', prefix='fa')).add_to(map_restaurent)

map_restaurent
Out[55]:
Make this Notebook Trusted to load map: File -> Trust Notebook

Level 2¶

Task-1

In [57]:
table_booking_percentage = df['Has Table booking'].value_counts(normalize=True) * 100
online_delivery_percentage = df['Has Online delivery'].value_counts(normalize=True) * 100

print(f"Percentage of restaurants offering table booking:")
print(table_booking_percentage)

print(f"Percentage of restaurants offering online delivery:")
print(online_delivery_percentage)
Percentage of restaurants offering table booking:
Has Table booking
No     87.875615
Yes    12.124385
Name: proportion, dtype: float64
Percentage of restaurants offering online delivery:
Has Online delivery
No     74.337766
Yes    25.662234
Name: proportion, dtype: float64
In [59]:
average_rating_with_table_booking = df[df['Has Table booking'] == 'Yes']['Aggregate rating'].mean()
average_rating_without_table_booking = df[df['Has Table booking'] == 'No']['Aggregate rating'].mean()

print(f"Average rating of restaurants with table booking: {average_rating_with_table_booking:.2f}")
print(f"Average rating of restaurants without table booking: {average_rating_without_table_booking:.2f}")
Average rating of restaurants with table booking: 3.44
Average rating of restaurants without table booking: 2.56
In [61]:
online_delivery_by_price_range = df.groupby('Price range')['Has Online delivery'].value_counts(normalize=True).unstack() * 100

print("Percentage of restaurants offering online delivery by price range:")
print(online_delivery_by_price_range)
Percentage of restaurants offering online delivery by price range:
Has Online delivery         No        Yes
Price range                              
1                    84.225923  15.774077
2                    58.689367  41.310633
3                    70.809659  29.190341
4                    90.955631   9.044369
In [63]:
online_delivery_by_price_range.plot(kind='bar')
plt.xlabel('Price Range')
plt.ylabel('Number of Restaurants with Online Delivery')
plt.title('Availability of Online Delivery by Price Range')
plt.show()
No description has been provided for this image

Task-2

In [66]:
most_common_price_range = df['Price range'].mode().iloc[0]
print(f"Most Common Price Range: {most_common_price_range}")
Most Common Price Range: 1
In [113]:
average_rating_by_price_range = df.groupby('Price range')['Votes'].mean()
print("Average Rating by Price Range:")
print(average_rating_by_price_range)
Average Rating by Price Range:
Price range
1     44.597435
2    147.607131
3    443.860795
4    368.595563
Name: Votes, dtype: float64
In [115]:
average_rating_by_price_range = df.groupby('Price range')['Votes'].mean()
print("Average Rating by Price Range:")
print(average_rating_by_price_range)
Average Rating by Price Range:
Price range
1     44.597435
2    147.607131
3    443.860795
4    368.595563
Name: Votes, dtype: float64
In [117]:
highest_rating_color = average_rating_by_price_range.idxmax()
print(f"Color representing the highest average rating: {highest_rating_color}")
Color representing the highest average rating: 3

Task-3

In [120]:
df['NameLength'] = df['Restaurant Name'].apply(len)
df['NameLength']
Out[120]:
0       16
1       16
2       22
3        4
4       11
        ..
9546    11
9547    12
9548     5
9549    11
9550    24
Name: NameLength, Length: 9551, dtype: int64
In [122]:
df['Has Table booking'] = df['Has Table booking'].map({True: 1, False: 0})
df['Has Online delivery'] = df['Has Online delivery'].map({True: 1, False: 0})
print(df)
      Restaurant ID           Restaurant Name  Country Code              City  \
0           6317637          Le Petit Souffle           162       Makati City   
1           6304287          Izakaya Kikufuji           162       Makati City   
2           6300002    Heat - Edsa Shangri-La           162  Mandaluyong City   
3           6318506                      Ooma           162  Mandaluyong City   
4           6314302               Sambo Kojin           162  Mandaluyong City   
...             ...                       ...           ...               ...   
9546        5915730               Naml۱ Gurme           208         ��stanbul   
9547        5908749              Ceviz A��ac۱           208         ��stanbul   
9548        5915807                     Huqqa           208         ��stanbul   
9549        5916112               A���k Kahve           208         ��stanbul   
9550        5927402  Walter's Coffee Roastery           208         ��stanbul   

                                                Address  \
0     Third Floor, Century City Mall, Kalayaan Avenu...   
1     Little Tokyo, 2277 Chino Roces Avenue, Legaspi...   
2     Edsa Shangri-La, 1 Garden Way, Ortigas, Mandal...   
3     Third Floor, Mega Fashion Hall, SM Megamall, O...   
4     Third Floor, Mega Atrium, SM Megamall, Ortigas...   
...                                                 ...   
9546  Kemanke�� Karamustafa Pa��a Mahallesi, R۱ht۱m ...   
9547  Ko��uyolu Mahallesi, Muhittin ��st�_nda�� Cadd...   
9548  Kuru�_e��me Mahallesi, Muallim Naci Caddesi, N...   
9549  Kuru�_e��me Mahallesi, Muallim Naci Caddesi, N...   
9550  Cafea��a Mahallesi, Bademalt۱ Sokak, No 21/B, ...   

                                        Locality  \
0      Century City Mall, Poblacion, Makati City   
1     Little Tokyo, Legaspi Village, Makati City   
2     Edsa Shangri-La, Ortigas, Mandaluyong City   
3         SM Megamall, Ortigas, Mandaluyong City   
4         SM Megamall, Ortigas, Mandaluyong City   
...                                          ...   
9546                                    Karak�_y   
9547                                   Ko��uyolu   
9548                                 Kuru�_e��me   
9549                                 Kuru�_e��me   
9550                                        Moda   

                                       Locality Verbose   Longitude  \
0     Century City Mall, Poblacion, Makati City, Mak...  121.027535   
1     Little Tokyo, Legaspi Village, Makati City, Ma...  121.014101   
2     Edsa Shangri-La, Ortigas, Mandaluyong City, Ma...  121.056831   
3     SM Megamall, Ortigas, Mandaluyong City, Mandal...  121.056475   
4     SM Megamall, Ortigas, Mandaluyong City, Mandal...  121.057508   
...                                                 ...         ...   
9546                                Karak�_y, ��stanbul   28.977392   
9547                               Ko��uyolu, ��stanbul   29.041297   
9548                             Kuru�_e��me, ��stanbul   29.034640   
9549                             Kuru�_e��me, ��stanbul   29.036019   
9550                                    Moda, ��stanbul   29.026016   

       Latitude                          Cuisines  ...  Has Table booking  \
0     14.565443        French, Japanese, Desserts  ...                NaN   
1     14.553708                          Japanese  ...                NaN   
2     14.581404  Seafood, Asian, Filipino, Indian  ...                NaN   
3     14.585318                   Japanese, Sushi  ...                NaN   
4     14.584450                  Japanese, Korean  ...                NaN   
...         ...                               ...  ...                ...   
9546  41.022793                           Turkish  ...                NaN   
9547  41.009847   World Cuisine, Patisserie, Cafe  ...                NaN   
9548  41.055817            Italian, World Cuisine  ...                NaN   
9549  41.057979                   Restaurant Cafe  ...                NaN   
9550  40.984776                              Cafe  ...                NaN   

     Has Online delivery  Is delivering now  Switch to order menu Price range  \
0                    NaN                 No                    No           3   
1                    NaN                 No                    No           3   
2                    NaN                 No                    No           4   
3                    NaN                 No                    No           4   
4                    NaN                 No                    No           4   
...                  ...                ...                   ...         ...   
9546                 NaN                 No                    No           3   
9547                 NaN                 No                    No           3   
9548                 NaN                 No                    No           4   
9549                 NaN                 No                    No           4   
9550                 NaN                 No                    No           2   

     Aggregate rating  Rating color  Rating text Votes NameLength  
0                 4.8    Dark Green    Excellent   314         16  
1                 4.5    Dark Green    Excellent   591         16  
2                 4.4         Green    Very Good   270         22  
3                 4.9    Dark Green    Excellent   365          4  
4                 4.8    Dark Green    Excellent   229         11  
...               ...           ...          ...   ...        ...  
9546              4.1         Green    Very Good   788         11  
9547              4.2         Green    Very Good  1034         12  
9548              3.7        Yellow         Good   661          5  
9549              4.0         Green    Very Good   901         11  
9550              4.0         Green    Very Good   591         24  

[9551 rows x 22 columns]

Level 3¶

Task-1

In [128]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import warnings
warnings.filterwarnings("ignore")
df = pd.read_csv('Dataset .csv')
In [130]:
non_numeric_cols = df.select_dtypes(exclude=[float, int]).columns.tolist()
print("Columns with non-numeric values:")
print(non_numeric_cols)
Columns with non-numeric values:
['Restaurant Name', 'City', 'Address', 'Locality', 'Locality Verbose', 'Cuisines', 'Currency', 'Has Table booking', 'Has Online delivery', 'Is delivering now', 'Switch to order menu', 'Rating color', 'Rating text']
In [132]:
df.drop(columns=non_numeric_cols, inplace=True)
In [134]:
X = df.drop(columns=['Aggregate rating']) 
y = df['Aggregate rating']
In [136]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
In [138]:
def train_and_evaluate_model(model):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    return mse, r2
In [140]:
linear_regression = LinearRegression()
mse_lr, r2_lr = train_and_evaluate_model(linear_regression)
print(f"Linear Regression - MSE: {mse_lr:.2f}, R2 Score: {r2_lr:.2f}")
Linear Regression - MSE: 1.58, R2 Score: 0.31
In [142]:
decision_tree = DecisionTreeRegressor()
mse_dt, r2_dt = train_and_evaluate_model(decision_tree)
print(f"Decision Tree Regression - MSE: {mse_dt:.2f}, R2 Score: {r2_dt:.2f}")
Decision Tree Regression - MSE: 0.15, R2 Score: 0.93
In [144]:
random_forest = RandomForestRegressor()
mse_rf, r2_rf = train_and_evaluate_model(random_forest)
print(f"Random Forest Regression - MSE: {mse_rf:.2f}, R2 Score: {r2_rf:.2f}")
Random Forest Regression - MSE: 0.08, R2 Score: 0.97

Task-2

In [148]:
df = pd.read_csv('Dataset .csv')

average_rating_by_cuisine = df.groupby('Cuisines')['Votes'].mean()

average_rating_by_cuisine.plot(kind='bar')
plt.xlabel('Cuisines')
plt.ylabel('Average rating')
plt.title('Average Rating by Cuisine Type')
plt.xticks(rotation=45)
plt.show()
No description has been provided for this image
In [150]:
most_popular_cuisines = df.groupby('Cuisines')['Votes'].sum().sort_values(ascending=False)

most_popular_cuisines.plot(kind='bar')
plt.xlabel('Cuisines')
plt.ylabel('Number of Votes')
plt.title('Most Popular Cuisines based on Votes')
plt.xticks(rotation=45)
plt.show()
No description has been provided for this image
In [154]:
average_rating_by_price_range = df.groupby('Price range')['Aggregate rating'].mean()

highest_rating_price_range = average_rating_by_price_range.idxmax()

sns.barplot(x=average_rating_by_price_range.index, y=average_rating_by_price_range.values)
plt.xlabel('Price range')
plt.ylabel('Average Rating')
plt.title('Average Rating for Each Price Range')

highest_rating_index = average_rating_by_price_range.index.get_loc(highest_rating_price_range)
plt.bar(highest_rating_index, average_rating_by_price_range[highest_rating_price_range], color='red')

plt.show()
No description has been provided for this image

Task-3

In [157]:
plt.figure(figsize=(8, 6))
plt.hist(df['Aggregate rating'], bins=20, edgecolor='k')
plt.xlabel('Rating')
plt.ylabel('Frequency')
plt.title('Distribution of Ratings')
plt.show()
No description has been provided for this image
In [159]:
plt.figure(figsize=(12, 6))
average_ratings_by_cuisine = df.groupby('Cuisines')['Aggregate rating'].mean().sort_values(ascending=False)
sns.barplot(x=average_ratings_by_cuisine.index, y=average_ratings_by_cuisine.values)
plt.xlabel('Cuisine')
plt.ylabel('Average Rating')
plt.title('Average Ratings by Cuisine')
plt.xticks(rotation=90)
plt.show()
No description has been provided for this image
In [161]:
plt.figure(figsize=(12, 6))
average_ratings_by_city = df.groupby('City')['Aggregate rating'].mean().sort_values(ascending=False)
sns.barplot(x=average_ratings_by_city.index, y=average_ratings_by_city.values, palette='magma')
plt.xlabel('City')
plt.ylabel('Average Rating')
plt.title('Average Ratings by City')
plt.xticks(rotation=90)
plt.show()
No description has been provided for this image
In [163]:
plt.figure(figsize=(12, 6))
sns.boxplot(x='Cuisines', y='Aggregate rating', data=df, palette='Set3')
plt.xlabel('Cuisine')
plt.ylabel('Rating')
plt.title('Relationship between Cuisine and Rating')
plt.xticks(rotation=90)
plt.show()
No description has been provided for this image
In [165]:
plt.figure(figsize=(8, 6))
sns.boxplot(x='Price range', y='Aggregate rating', data=df, palette='Paired')
plt.xlabel('Price Range')
plt.ylabel('Rating')
plt.title('Relationship between Price Range and Rating')
plt.show()
No description has been provided for this image

Level-3¶

Task1 - Predictive Modeling

Build a regression model to predict the aggregate rating of a restaurant based on available features.

In [181]:
non_numeric_cols = df.select_dtypes(exclude=[float, int]).columns.tolist()
print("Columns with non-numeric values:")
print(non_numeric_cols)

df.drop(columns=non_numeric_cols, inplace=True)

X= df.drop(columns=['Aggregate rating']) 
y= df['Aggregate rating']
Columns with non-numeric values:
['Restaurant Name', 'City', 'Address', 'Locality', 'Locality Verbose', 'Cuisines', 'Currency', 'Has Table booking', 'Has Online delivery', 'Is delivering now', 'Switch to order menu', 'Rating color', 'Rating text']

Split the dataset into training and testing sets and evaluate the model's performance using appropriate metrics.

In [184]:
# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Function to train and evaluate a regression model
def train_and_evaluate_model(model):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    return mse, r2

Experiment with different algorithms (e.g., linear regression, decision trees, random forest) and compare their performance

In [189]:
# Training and evaluating Linear Regression model
linear_regression = LinearRegression()
mse_lr, r2_lr = train_and_evaluate_model(linear_regression)

# Training and evaluating Decision Tree Regression model
decision_tree = DecisionTreeRegressor()
mse_dt, r2_dt = train_and_evaluate_model(decision_tree)

# Training and evaluating Random Forest Regression model
random_forest = RandomForestRegressor()
mse_rf, r2_rf = train_and_evaluate_model(random_forest)

# Print the performance metrics
print(f"Linear Regression - MSE: {mse_lr:.2f}, R2 Score: {r2_lr:.2f}")
print(f"Decision Tree Regression - MSE: {mse_dt:.2f}, R2 Score: {r2_dt:.2f}")
print(f"Random Forest Regression - MSE: {mse_rf:.2f}, R2 Score: {r2_rf:.2f}")
Linear Regression - MSE: 1.58, R2 Score: 0.31
Decision Tree Regression - MSE: 0.15, R2 Score: 0.93
Random Forest Regression - MSE: 0.07, R2 Score: 0.97

Level-2 Customer Preference Analysis

Analyze the relationship between the type of 0cuisine and the restaurant's rating.

In [196]:
# Analyzing the relationship between the type of cuisine and the restaurant's rating.
#average_rating_by_cuisine = df.groupby('Cuisines')['Votes'].mean()

# Plot the average rating by cuisine
average_rating_by_cuisine.plot(kind='bar')
plt.xlabel('Cuisines')
plt.ylabel('Average rating')
plt.title('Average Rating by Cuisine Type')
plt.xticks(rotation=45)
plt.show()
No description has been provided for this image

Identify the most popular cuisines amongcustomers based on the number of votes.

In [201]:
# Identifying the most popular cuisines among customers based on the number of votes.
#most_popular_cuisines = df.groupby('Cuisines')['Votes'].sum().sort_values(ascending=False)

# Plot the number of votes by cuisine
most_popular_cuisines.plot(kind='bar')
plt.xlabel('Cuisines')
plt.ylabel('Number of Votes')
plt.title('Most Popular Cuisines based on Votes')
plt.xticks(rotation=45)
plt.show()
No description has been provided for this image

Determine if there are any specific cuisines that tend to receive higher ratings.

Task-3 Data Visualization

Create visualizations to represent the distribution of ratings using different charts (histogram, bar plot, etc.).

In [210]:
# Distribution of ratings using a histogram
plt.figure(figsize=(8, 6))
plt.hist(df['Aggregate rating'], bins=20, edgecolor='k')
plt.xlabel('Rating')
plt.ylabel('Frequency')
plt.title('Distribution of Ratings')
plt.show()
No description has been provided for this image

Compare the average ratings of different cuisines or cities using appropriate visualizations.

In [229]:
# Comparison of average ratings of different cuisines using a bar plot
plt.figure(figsize=(12, 6))
#average_ratings_by_cuisine = df.groupby('Cuisines')['Aggregate rating'].mean().sort_values(ascending=False)
sns.barplot(x=average_ratings_by_cuisine.index, y=average_ratings_by_cuisine.values, palette='viridis')
plt.xlabel('Cuisine')
plt.ylabel('Average Rating')
plt.title('Average Ratings by Cuisine')
plt.xticks(rotation=90)
plt.show()

# Comparison of average ratings of different cities using a bar plot
plt.figure(figsize=(12, 6))
#average_ratings_by_city = df.groupby('City')['Aggregate rating'].mean().sort_values(ascending=False)
sns.barplot(x=average_ratings_by_city.index, y=average_ratings_by_city.values, palette='magma')
plt.xlabel('City')
plt.ylabel('Average Rating')
plt.title('Average Ratings by City')
plt.xticks(rotation=90)
plt.show()
No description has been provided for this image
No description has been provided for this image

Visualize the relationship between various features and the target variable to gain insights.

In [233]:
# Visualization of the relationship between various features and the target variable
plt.figure(figsize=(12, 6))
sns.boxplot(x='Cuisines', y='Aggregate rating', data=df, palette='Set3')
plt.xlabel('Cuisine')
plt.ylabel('Rating')
plt.title('Relationship between Cuisine and Rating')
plt.xticks(rotation=90)
plt.show()

plt.figure(figsize=(8, 6))
sns.boxplot(x='Price range', y='Aggregate rating', data=df, palette='Paired')
plt.xlabel('Price Range')
plt.ylabel('Rating')
plt.title('Relationship between Price Range and Rating')
plt.show()
No description has been provided for this image
No description has been provided for this image
In [ ]: